/* 
Figure out a way to deal with [years with duplicates] duplicates. 
Flag which ones to keep. 

Get the full set of duplicates: dups[years with duplicates]. 
Then narrow down dups[years with duplicates] to the set that we should flag in pikyearspine. 

[] brackets indicate redacted internal census variable name or directory that can't be disclosed. 

*/

libname icf '[directory location of individual characteristics files]';
libname interm '/projects/users/########/Snapshot2022/IntermediateData';
libname tempdat '/projects/users/########/Snapshot2022/IntermediateData/TempData';
libname res21 '[directory location of geocoded worker residence files]';


* previously run;
/*
data tempdat.temp[years with duplicates];
  set res21.[residence file 1];
  if [year]~=[list of years with duplicates] then delete;
run;
*/

proc sort data = tempdat.temp[years with duplicates] nouniquekey out = tempdat.dups[years with duplicates];
 by pik [year];
run;

/*
FIRST get rid of true duplicates (duplicate in all fields, not just pik, year)
*/

* send true duplicates to their own dataset;
proc sort data = tempdat.dups[years with duplicates] nouniquekey out = tempdat.truedups[years with duplicates];
  by pik [year] [geocode] [latitude] [longitude] [quality flag for lat/lon];
run;

* create a true duplicates flag;
data tempdat.truedups[years with duplicates];
  set tempdat.truedups[years with duplicates];
  truedup = 1;
run;

* merge true duplicates back in so that they are flagged;
data tempdat.dups[years with duplicates];
  merge tempdat.dups[years with duplicates] tempdat.truedups[years with duplicates];
  by pik;
run;

* delete true duplicates and renumber quality flag for quality sorting; 
data tempdat.dups[years with duplicates];
  set tempdat.dups[years with duplicates];
  if truedup = 1 then delete;
  if [quality flag for lat/lon] = [flag value] then [quality flag for lat/lon] = [other flag value];
run;


/*
Get 2014 snapshot version of [years with duplicates] data for comparison. 
*/

* previously run;
/*
data tempdat.temp2014;
  set icf.icf_us_addresses;
  if [year]~=[years with duplicates] then delete;
run;
*/
data tempdat.temp2014;
  set tempdat.temp2014 (keep = pik [year] [county id] [latitude] [longitude]);
  rename [latitude] = lat2014;
  rename [longitude] = lon2014;
run;

* merge pik-[year]'s from 2014 snapshot; 
proc sql;
  create table tempdat.dups[years with duplicates] as select
    a.*, b.pik, b.[year], b.[county id], b.lat2014, b.lon2014
  from tempdat.dups[years with duplicates] as a 
    left join tempdat.temp2014 as b
    on a.pik = b.pik and a.[year] = b.[year]
    order by a.pik, a.[year];
quit;
* flag keepers if coordinates match; 
data tempdat.dups[years with duplicates];
  set tempdat.dups[years with duplicates];
  if [latitude] = lat2014 and [longitude] = lon2014 then keeper = 1;
    else keeper = 0;
run;

* flag those to drop by quality flag and keeper status (coordinate match to 2014 snapshot);
proc sort data = tempdat.dups[years with duplicates];
  by pik [quality flag for lat/lon] DESCENDING keeper;
run;
data tempdat.dups[years with duplicates];
  set tempdat.dups[years with duplicates];
  by pik;
  dropflag = 1;
  if first.pik then dropflag = 0;
run;

* Now merge dups[years with duplicates] to larger pikyearspine and drop those with dropflag = 1; 


